# Data wrangling & reporting
library(tidyverse)
library(knitr)
library(knitr)      # NOTE(review): duplicate of the call above; safe to drop
library(expss)

# Plotting
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer)
library(gplots)     # provides heatmap.2(), used for the correlation heatmap below

# Dimensionality reduction & model evaluation
library(Rtsne)
library(ROCR)
# Map palette indices to the SFARI display colours.
#
# r: integer vector of positions into the palette (1-6 correspond to SFARI
#    gene scores 1-6; 7-10 are grey shades used for unscored/unlabelled genes).
# Returns the corresponding hex colour codes / colour names.
#
# Fix: the original body assigned the result to a local variable as its last
# expression, which made the return value invisible; the value is now
# returned directly (and visibly).
SFARI_colour_hue = function(r) {
  c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9',
    '#b3b3b3','#808080','gray','#d9d9d9')[r]
}
Load dataset (preprocessing code in 19_10_11_create_dataset.Rmd)
# Name of the WGCNA clustering whose module assignments are used throughout
clustering_selected = 'DynamicHybridMergedSmall'
print(paste0('Using clustering ', clustering_selected))
## [1] "Using clustering DynamicHybridMergedSmall"
# Dataset created with DynamicTreeMerged algorithm
# (preprocessing lives in 19_10_11_create_dataset.Rmd); rownames are gene IDs
dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'), row.names=1)
Remove genes without cluster (Module=gray)
Remove genes with Module-Trait correlation missing
# Module(s) whose Module-Trait correlation is missing; their membership
# columns are dropped from the feature set further down
rm_cluster = dataset[is.na(dataset$MTcor),clustering_selected] %>% unique %>% as.character
print(paste0('Removing ', sum(dataset[,clustering_selected]=='gray'), ' genes without cluster'))
## [1] "Removing 178 genes without cluster"
print(paste0('Removing ', sum(is.na(dataset$MTcor)), ' genes belonging to module(s) without module-trait correlation: ',
rm_cluster))
## [1] "Removing 1555 genes belonging to module(s) without module-trait correlation: #00BD64"
# Keep only genes that have a cluster assignment and a module-trait correlation
new_dataset = dataset %>% filter(dataset[,clustering_selected]!='gray' & !is.na(MTcor))
Using Module Membership variables instead of binary module membership
Not including p-value variables
Including a new variable with the absolute value of GS
Removing information from gray module (unclassified genes) and any other module that did not have a Module-Trait value
Objective variable: Binary label indicating if it’s in the SFARI dataset or not (including score 6)
# Drop p-value columns, the module-assignment column itself, the membership
# columns of the removed module(s), and the gray-module membership column;
# add absGS (|Gene Significance|) and the binary SFARI objective variable
new_dataset = new_dataset %>% dplyr::select(-c(matches(paste('pval', clustering_selected,
gsub('#','',rm_cluster), sep='|')), MMgray)) %>%
mutate('absGS'=abs(GS), 'SFARI'=ifelse(gene.score=='None', FALSE, TRUE)) %>%
dplyr::select(-gene.score)
# filter() drops rownames, so rebuild them by applying the same condition to
# the original dataset (relies on filter() preserving row order)
rownames(new_dataset) = rownames(dataset)[!is.na(dataset$MTcor) & dataset[,clustering_selected]!='gray']
rm(rm_cluster)
# Keep the unfiltered dataset around: later sections need gene.score and the
# module assignment column
original_dataset = dataset
dataset = new_dataset
print(paste0('The final dataset contains ', nrow(dataset), ' observations and ', ncol(dataset), ' variables.'))
## [1] "The final dataset contains 14867 observations and 38 variables."
rm(new_dataset)
Sample of the dataset
# Preview the first rows of the modelling dataset as a markdown table
dataset %>% head %>% kable
| MTcor | GS | MM.00C1A8 | MM.4B9FFF | MM.8195FF | MM.C17FFF | MM.C49A00 | MM.D29300 | MM.FF64B2 | MM.FF699D | MM.FB61D7 | MM.00C0BB | MM.53B400 | MM.E88523 | MM.00B0F6 | MM.F365E6 | MM.FD6F86 | MM.00BBDD | MM.00B70C | MM.75AF00 | MM.DE8C00 | MM.B4A000 | MM.F8766D | MM.00B6EB | MM.00BF7D | MM.D774FD | MM.A3A500 | MM.F17E4F | MM.00C094 | MM.00A8FF | MM.00BECD | MM.A58AFF | MM.E76BF3 | MM.FF61C5 | MM.00BB45 | MM.8EAB00 | absGS | SFARI | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ENSG00000000003 | 0.7601073 | 0.3827131 | -0.3477130 | -0.1449705 | -0.1646246 | -0.2829441 | -0.3416614 | -0.1752186 | -0.1947862 | 0.0280602 | -0.1139808 | -0.1800823 | -0.2265495 | 0.0824399 | 0.1910086 | -0.0125141 | -0.0148751 | -0.1126751 | -0.0522299 | 0.1702235 | -0.0318971 | 0.5014482 | 0.3716079 | 0.1920288 | 0.3141084 | 0.3318810 | 0.4772117 | 0.4106499 | 0.1092023 | 0.1433862 | 0.1295224 | -0.0098212 | 0.3358176 | 0.1393598 | 0.1434403 | 0.2296628 | 0.3827131 | FALSE |
| ENSG00000000419 | 0.4150643 | 0.0361671 | -0.0509414 | -0.4181807 | -0.3435142 | -0.0950710 | 0.1271936 | 0.3731103 | -0.1661217 | -0.0683150 | 0.1835545 | -0.2093919 | -0.0438694 | 0.0019727 | 0.2711652 | 0.1487585 | 0.1426498 | -0.1856460 | -0.1161695 | 0.0769049 | -0.0790248 | -0.2102113 | -0.1908434 | -0.0142087 | -0.0346279 | -0.2307758 | 0.0273226 | 0.1300158 | -0.0003214 | -0.1399217 | 0.1449922 | 0.3286803 | 0.1418381 | 0.1323354 | 0.5218180 | 0.2613671 | 0.0361671 | FALSE |
| ENSG00000000457 | -0.9216750 | -0.4005675 | 0.1874271 | 0.2806782 | 0.1875352 | 0.3747611 | 0.3393944 | 0.2716503 | 0.3227764 | 0.1136580 | 0.1760050 | 0.1146922 | 0.3195916 | -0.2275923 | -0.0641184 | -0.0820587 | 0.0331912 | 0.1632690 | -0.0397420 | -0.1388166 | -0.0426981 | -0.0552223 | -0.0629920 | -0.1494477 | -0.2855344 | -0.2286590 | -0.3060407 | -0.2607013 | -0.3293047 | -0.1707641 | -0.2804889 | -0.0996501 | -0.2359453 | -0.1367952 | -0.0969217 | -0.2679999 | 0.4005675 | FALSE |
| ENSG00000000460 | -0.1165650 | -0.2386668 | 0.0081068 | 0.0453039 | -0.1677691 | 0.1659316 | 0.3632797 | 0.4758476 | 0.1413266 | 0.0073811 | 0.0232871 | -0.0620729 | 0.0492475 | 0.1279434 | 0.2559762 | 0.1658171 | 0.2708015 | 0.0877830 | -0.0212876 | -0.1033743 | -0.0918440 | -0.2760330 | -0.2148459 | -0.1636790 | -0.3609935 | -0.4116075 | -0.4156419 | -0.2123721 | -0.2596629 | -0.3395212 | -0.0682352 | 0.1426534 | 0.0297760 | 0.1965458 | 0.3086452 | -0.2279076 | 0.2386668 | FALSE |
| ENSG00000000971 | 0.8556942 | 0.6586707 | -0.2759526 | -0.3511587 | -0.2780262 | -0.3940097 | -0.4692445 | -0.2210057 | -0.1077318 | 0.0231030 | -0.0266761 | -0.2443301 | -0.2726795 | 0.0025340 | 0.0112294 | 0.2300797 | -0.1711726 | -0.3056525 | 0.0899908 | -0.0559141 | 0.0961331 | 0.0052618 | -0.1778786 | 0.2297739 | 0.1605946 | 0.2297970 | 0.4252786 | 0.2986677 | 0.8196572 | 0.4208318 | 0.5669501 | 0.0737886 | 0.1440107 | 0.2222202 | 0.2006217 | 0.0904164 | 0.6586707 | FALSE |
| ENSG00000001036 | 0.7601073 | 0.3221038 | -0.1361035 | -0.5330410 | -0.4449867 | -0.5478868 | -0.3157576 | 0.0951756 | -0.3301118 | -0.2001291 | 0.1974429 | -0.1947359 | -0.1959688 | 0.1137417 | 0.0583008 | 0.0552634 | 0.0148300 | -0.4207842 | -0.1377539 | 0.0398521 | 0.1730645 | -0.0030739 | 0.2322611 | 0.0643902 | -0.0654919 | -0.2230414 | 0.1810246 | 0.5690748 | 0.0845592 | 0.0307338 | 0.3770384 | 0.5580772 | 0.5086895 | 0.4994903 | 0.5013863 | 0.4209643 | 0.3221038 | FALSE |
Objective variable distribution: Unbalanced labels
# Class balance of the objective variable (heavily imbalanced: ~5.6% positive)
print(table(dataset$SFARI))
##
## FALSE TRUE
## 14039 828
cat(paste0('\n',round(mean(dataset$SFARI)*100,2), '% of the observations are positive'))
##
## 5.57% of the observations are positive
Chose the t-SNE algorithm because it preserves local neighbourhood structure, so variables that behave similarly end up close together in the embedding
The SFARI label is still close to the absolute value of Gene Significance. This time the MM variables seem to be grouped in 2 clusters
# t-SNE on the TRANSPOSED dataset: each point is a variable (column), not a
# gene, so the plot shows which variables behave similarly across genes.
# NOTE(review): Rtsne is stochastic and no set.seed() precedes it, so the
# layout changes between runs
tsne = dataset %>% t %>% Rtsne(perplexity=10)
# Classify each column into a variable family for colouring (absGS matches
# the 'GS' pattern and is labelled GS, which is fine — both are GS-derived)
plot_data = data.frame('ID'=colnames(dataset), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2],
type=ifelse(grepl('MM', colnames(dataset)),'ModMembership',
ifelse(grepl('SFARI', colnames(dataset)), 'SFARI',
ifelse(grepl('GS', colnames(dataset)), 'GS', 'MTcor'))))
ggplotly(plot_data %>% ggplot(aes(C1, C2, color=type)) + geom_point(aes(id=ID)) +
theme_minimal() + ggtitle('t-SNE visualisation of variables'))
The Module Membership variables are grouped by Module-Trait correlation, with positive correlations on one side, negative on the other, and modules with low correlation far away from the SFARI tag
# One Module-Trait correlation per module; module IDs are hex colours, so
# rewrite '#XXXXXX' to 'MM.XXXXXX' to match the membership column names
mtcor_by_module = original_dataset %>% dplyr::select(matches(clustering_selected), MTcor) %>% unique
colnames(mtcor_by_module) = c('ID','MTcor')
# right_join keeps every variable in plot_data; non-MM variables get NA MTcor
plot_data = mtcor_by_module %>% mutate(ID = gsub('#','MM.',ID)) %>% right_join(plot_data, by='ID')
ggplotly(plot_data %>% ggplot(aes(C1, C2, color=MTcor)) + geom_point(aes(id=ID)) +
scale_color_viridis() + theme_minimal() +
ggtitle('t-SNE of variables coloured by Module-Diagnosis correlation'))
rm(mtcor_by_module, tsne)
# Mean Expression data
# Loads datExpr (and other objects, e.g. datGenes/datMeta/dds/DE_info,
# removed after the PCA section below) from the preprocessed .RData file
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
# Mean expression per gene; used to colour the PCA plot below
mean_expr = data.frame('ID'=rownames(datExpr), 'meanExpr' = rowMeans(datExpr))
# PCA
# PCA on the transposed dataset: pca$rotation rows then correspond to genes
pca = dataset %>% t %>% prcomp
plot_data = data.frame('ID'=rownames(dataset), 'PC1'=pca$rotation[,1], 'PC2'=pca$rotation[,2],
'SFARI'=dataset$SFARI, 'MTcor'=dataset$MTcor, 'GS'=dataset$GS) %>%
mutate(alpha=ifelse(SFARI, 0.7, 0.2)) %>% left_join(mean_expr, by='ID')
# Four views of the same PC1/PC2 embedding, coloured by different variables
p1 = plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(alpha=0.7) + scale_color_viridis() +
theme_minimal() + ggtitle('Genes coloured by Module-Diagnosis correlation') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
theme(legend.position='bottom')
p2 = plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.5) + scale_color_viridis() +
theme_minimal() + ggtitle('Genes coloured by Gene Significance') + theme(legend.position='bottom')
p3 = plot_data %>% ggplot(aes(PC1, PC2, color=SFARI)) + geom_point(aes(alpha=alpha)) +
theme_minimal() + ggtitle('Genes coloured by SFARI label') + theme(legend.position='bottom')
# Marginal densities make the small SFARI group visible despite overplotting
p3 = ggExtra::ggMarginal(p3, type='density', groupColour=TRUE, size=10)
p4 = plot_data %>% ggplot(aes(PC1, PC2, color=meanExpr)) + geom_point(alpha=0.5) + scale_color_viridis() +
theme_minimal() + ggtitle('Genes coloured by mean level of expression') + theme(legend.position='bottom')
grid.arrange(p1, p2, p3, p4, nrow=2)
# datGenes, datMeta, dds and DE_info came from the .RData file loaded above
rm(pca, datExpr, datGenes, datMeta, dds, DE_info, mean_expr, p1, p2, p3, p4)
For now, will do this using over- and under-sampling of the classes, but later on should check SMOTE (Synthetic Minority Over-sampling Technique) method
Need to divide first into train and test sets to keep the sets independent: using 80% of the Positive observations on the training set
positive_idx = which(dataset$SFARI)
negative_idx = which(!dataset$SFARI)
set.seed(123)
# 20% of the positive genes go to the test set
positive_test_idx = sort(sample(positive_idx, size=floor(0.2*length(positive_idx))))
positive_train_idx = positive_idx[!positive_idx %in% positive_test_idx]
set.seed(123)
# NOTE(review): the negative test sample is sized by length(positive_idx),
# not length(negative_idx), which deliberately yields a BALANCED test set
# (165/165 per the tables below). Confirm intended: a balanced test set
# overstates real-world precision given the ~5.6% positive rate.
negative_test_idx = sort(sample(negative_idx, size=floor(0.2*length(positive_idx))))
negative_train_idx = negative_idx[!negative_idx %in% negative_test_idx]
train_set = dataset[c(positive_train_idx,negative_train_idx),]
test_set = dataset[c(positive_test_idx,negative_test_idx),]
rm(positive_idx, negative_idx, positive_train_idx, positive_test_idx, negative_train_idx, negative_test_idx)
Over-sampling observations with positive SFARI label: Sample with replacement 4x original number of observations
Sample with replacement positive observations in train set
# Over-sample positives in the train set: append 3 bootstrap copies so each
# positive observation appears ~4x in total
positive_obs = which(train_set$SFARI)
set.seed(123)
add_obs = sample(positive_obs, size=3*length(positive_obs), replace=TRUE)
train_set = train_set[c(1:nrow(train_set), add_obs),]
rm(positive_obs, add_obs)
Under-sampling observations with negative SFARI labels
print(paste0('Keeping ~',round(100*sum(train_set$SFARI)/sum(!train_set$SFARI)),
'% of the Negative observations in the training set'))
## [1] "Keeping ~19% of the Negative observations in the training set"
# Under-sample negatives down to the (over-sampled) positive count so the
# training set is exactly balanced
negative_obs = which(!train_set$SFARI)
set.seed(123)
remove_obs = sample(negative_obs, size=(sum(!train_set$SFARI)-sum(train_set$SFARI)))
train_set = train_set[-remove_obs,]
rm(negative_obs, remove_obs)
Label distribution in training set
# Cross-tabulate the (now balanced) training labels
cro(train_set$SFARI)
| #Total | |
|---|---|
| train_set$SFARI | |
| FALSE | 2652 |
| TRUE | 2652 |
| #Total cases | 5304 |
Labels distribution in test set
# Cross-tabulate the test labels (balanced by construction, see split above)
cro(test_set$SFARI)
| #Total | |
|---|---|
| test_set$SFARI | |
| FALSE | 165 |
| TRUE | 165 |
| #Total cases | 330 |
Train model
# glm() with family='binomial' needs a factor (or 0/1) response
train_set$SFARI = train_set$SFARI %>% as.factor
# Logistic regression on all remaining variables (MTcor, GS, absGS, MM.*)
fit = glm(SFARI~., data=train_set, family='binomial')
Predict labels in test set
# Predicted probability of being a SFARI gene; hard labels at the 0.5 cutoff
test_set$prob = predict(fit, newdata=test_set, type='response')
test_set$pred = test_set$prob>0.5
Confusion matrix
# expss variable labels give the cross-tabulation readable headers
conf_mat = test_set %>% apply_labels(SFARI = 'Actual Labels',
prob = 'Assigned Probability',
pred = 'Label Prediction')
# Confusion matrix: actual vs predicted labels, with row totals
cro(conf_mat$SFARI, list(conf_mat$pred, total()))
| Label Prediction | #Total | |||
|---|---|---|---|---|
| FALSE | TRUE | |||
| Actual Labels | ||||
| FALSE | 101 | 64 | 165 | |
| TRUE | 64 | 101 | 165 | |
| #Total cases | 165 | 165 | 330 | |
# Tidy up the labelled copy; test_set keeps prob/pred
rm(conf_mat)
Accuracy
# Overall accuracy on the (balanced) test set
acc = mean(test_set$SFARI==test_set$pred)
print(paste0('Accuracy = ', round(acc,4)))
## [1] "Accuracy = 0.6121"
rm(acc)
ROC Curve
# ROC curve and AUC via ROCR
pred_ROCR = prediction(test_set$prob, test_set$SFARI)
roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]
plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(AUC,2),')'), col='#009999')
# Chance diagonal for reference
abline(a=0, b=1, col='#666666')
Lift Curve
# Lift as a function of the rate of positive predictions
lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')
rm(pred_ROCR, roc_ROCR, AUC, lift_ROCR)
SFARI genes have a slightly higher distribution than the rest
# Density of assigned probabilities, split by true SFARI label
plot_data = test_set %>% dplyr::select(prob, SFARI)
plot_data %>% ggplot(aes(prob, fill=SFARI, color=SFARI)) + geom_density(alpha=0.3) +
theme_minimal() + ggtitle('Probability distribution by SFARI Label')
Scores 1 and 2 have the highest scores!
Score 6 has a completely different behaviour to the rest, with a lower distribution even than the unlabelled genes
# Recover the original SFARI score (1-6 / None) for each test gene from the
# unfiltered dataset, which still has the gene.score column
plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, prob) %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
dplyr::select(ID, prob, gene.score) %>% apply_labels(gene.score='SFARI Gene score')
cro(plot_data$gene.score)
| #Total | |
|---|---|
| SFARI Gene score | |
| 1 | 8 |
| 2 | 11 |
| 3 | 34 |
| 4 | 76 |
| 5 | 32 |
| 6 | 4 |
| None | 165 |
| #Total cases | 330 |
# Mean assigned probability per SFARI score, drawn as dashed vlines below
mean_vals = plot_data %>% group_by(gene.score) %>% summarise(mean_prob = mean(prob))
plot_data %>% ggplot(aes(prob, color=gene.score, fill=gene.score)) + geom_density(alpha=0.25) +
geom_vline(data=mean_vals, aes(xintercept=mean_prob, color=gene.score), linetype='dashed') +
# Palette indices c(1:6,8,7): score colours 1-6 followed by grey shades for
# the 'None' level (extra trailing value is unused — TODO confirm ggplot2
# version tolerates surplus manual values)
scale_colour_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
ggtitle('Distribution of probabilities by SFARI score') +
xlab('Probability') + ylab('Density') + theme_minimal()
ggplotly(plot_data %>% ggplot(aes(gene.score, prob, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
ggtitle('Distribution of probabilities by SFARI score') +
xlab('SFARI score') + ylab('Probability') + theme_minimal())
rm(mean_vals)
# Full model summary (coefficients, deviance, AIC)
summary(fit)
##
## Call:
## glm(formula = SFARI ~ ., family = "binomial", data = train_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2492 -1.0532 0.0199 1.0436 2.1933
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.37059 0.05727 -6.471 9.75e-11 ***
## MTcor 0.12633 0.08256 1.530 0.125962
## GS -2.76945 1.01602 -2.726 0.006415 **
## MM.00C1A8 -0.16808 0.80843 -0.208 0.835303
## MM.4B9FFF -2.24179 2.24440 -0.999 0.317873
## MM.8195FF 0.46242 2.39666 0.193 0.847003
## MM.C17FFF 8.90924 2.94143 3.029 0.002455 **
## MM.C49A00 -11.62208 3.30815 -3.513 0.000443 ***
## MM.D29300 -0.13735 2.28756 -0.060 0.952122
## MM.FF64B2 -5.58744 1.56680 -3.566 0.000362 ***
## MM.FF699D -1.71365 0.91484 -1.873 0.061044 .
## MM.FB61D7 11.05199 2.14017 5.164 2.42e-07 ***
## MM.00C0BB -4.94969 2.99814 -1.651 0.098755 .
## MM.53B400 3.76623 3.45902 1.089 0.276236
## MM.E88523 1.84402 0.68199 2.704 0.006853 **
## MM.00B0F6 0.27711 1.21039 0.229 0.818915
## MM.F365E6 -0.61250 3.31705 -0.185 0.853503
## MM.FD6F86 8.80736 4.23963 2.077 0.037766 *
## MM.00BBDD 6.18613 2.22925 2.775 0.005520 **
## MM.00B70C -5.21552 2.99197 -1.743 0.081303 .
## MM.75AF00 -0.67852 1.12138 -0.605 0.545129
## MM.DE8C00 0.95878 0.95129 1.008 0.313516
## MM.B4A000 1.96372 1.67126 1.175 0.239996
## MM.F8766D -1.76638 1.46890 -1.203 0.229161
## MM.00B6EB 3.21914 1.12566 2.860 0.004239 **
## MM.00BF7D 3.25031 1.67267 1.943 0.051993 .
## MM.D774FD 0.94612 2.02591 0.467 0.640491
## MM.A3A500 -6.20675 2.00338 -3.098 0.001947 **
## MM.F17E4F 3.97799 3.19984 1.243 0.213800
## MM.00C094 5.37684 1.56804 3.429 0.000606 ***
## MM.00A8FF -3.29108 2.90941 -1.131 0.257977
## MM.00BECD 1.44690 1.91594 0.755 0.450133
## MM.A58AFF 1.56584 1.62380 0.964 0.334892
## MM.E76BF3 2.68760 2.92319 0.919 0.357883
## MM.FF61C5 -0.94481 1.46724 -0.644 0.519618
## MM.00BB45 -7.62739 2.58661 -2.949 0.003190 **
## MM.8EAB00 -5.44687 1.88788 -2.885 0.003912 **
## absGS 0.28926 0.15747 1.837 0.066219 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7352.9 on 5303 degrees of freedom
## Residual deviance: 6614.0 on 5266 degrees of freedom
## AIC: 6690
##
## Number of Fisher Scoring iterations: 4
There doesn’t seem to be a relation between the modules’ coefficient and their correlation with Diagnosis, although there is correlation between the variables, so it’s not correct to analyse their coefficients when this happens
Also, these results change a lot between different runs
# Coefficient table with a significance flag; mutate() drops rownames, so
# the ID is taken from the coefficient matrix's rownames explicitly
var_fit_info = summary(fit)$coefficients %>% as.data.frame %>%
mutate(signif=`Pr(>|z|)`<0.05, ID=rownames(summary(fit)$coefficients))
plot_data = original_dataset %>% dplyr::select(matches(clustering_selected), MTcor)
colnames(plot_data) = c('ID','MTcor')
# tally() gives n = module size (gene count); inner_join keeps only the MM.*
# coefficients (intercept/GS/absGS/MTcor have no matching module ID)
plot_data = plot_data %>% mutate(ID=gsub('#','MM.',ID)) %>% group_by(ID, MTcor) %>%
tally %>% inner_join(var_fit_info, by='ID')
ggplotly(plot_data %>% ggplot(aes(Estimate, MTcor, color=signif, size=n)) + geom_point(aes(id=ID)) +
geom_smooth(method='lm', se=FALSE) + theme_minimal() +
xlab('Coefficient in regression') + ylab('Module-Diagnosis correlation'))
rm(var_fit_info)
Strong correlations between variables
# Pairwise correlations between predictors (last column, SFARI, excluded)
cors = cor(train_set[,-ncol(train_set)])
# heatmap.2() is from gplots — attached in this session (see sessionInfo),
# but consider an explicit library(gplots) at the top of the file
heatmap.2(cors, dendrogram='none', col=brewer.pal(11,'RdBu'), scale='none', trace='none')
rm(cors)
Print genes with highest scores in test set
First gene has score=1
Not a single gene with score 6
# Top 50 test genes by assigned probability, annotated with their original
# SFARI score, Module-Trait correlation, and module assignment
test_set %>% dplyr::select(prob, SFARI) %>% mutate(ID = rownames(test_set)) %>%
arrange(desc(prob)) %>% top_n(50, wt=prob) %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
dplyr::select(ID, prob, SFARI, gene.score, MTcor, matches(clustering_selected)) %>%
kable
| ID | prob | SFARI | gene.score | MTcor | DynamicHybridMergedSmall |
|---|---|---|---|---|---|
| ENSG00000118058 | 0.8772247 | TRUE | 1 | 0.2020738 | #00B70C |
| ENSG00000152217 | 0.8743730 | TRUE | 3 | 0.2020738 | #00B70C |
| ENSG00000139767 | 0.8674337 | TRUE | 5 | -0.5362834 | #00C0BB |
| ENSG00000162105 | 0.8611231 | TRUE | 2 | -0.5362834 | #00C0BB |
| ENSG00000183117 | 0.8428797 | TRUE | 4 | -0.9216750 | #C17FFF |
| ENSG00000127124 | 0.8414151 | TRUE | 3 | 0.2020738 | #00B70C |
| ENSG00000170579 | 0.8389252 | TRUE | 3 | -0.9216750 | #C17FFF |
| ENSG00000197694 | 0.8388399 | FALSE | None | -0.5362834 | #00C0BB |
| ENSG00000049618 | 0.8365423 | TRUE | 1 | 0.2020738 | #00B70C |
| ENSG00000169057 | 0.8329187 | TRUE | 2 | -0.6714074 | #8195FF |
| ENSG00000116539 | 0.8326399 | TRUE | 1 | 0.2020738 | #00B70C |
| ENSG00000147010 | 0.8268095 | TRUE | 5 | 0.2020738 | #00B70C |
| ENSG00000125851 | 0.8227207 | FALSE | None | -0.6557529 | #53B400 |
| ENSG00000119866 | 0.8194519 | TRUE | 2 | -0.5362834 | #00C0BB |
| ENSG00000105409 | 0.7985797 | TRUE | 4 | -0.5362834 | #00C0BB |
| ENSG00000130477 | 0.7928487 | TRUE | 4 | -0.9216750 | #C17FFF |
| ENSG00000082701 | 0.7927872 | TRUE | 5 | -0.5362834 | #00C0BB |
| ENSG00000133958 | 0.7836568 | TRUE | 3 | 0.2020738 | #00B70C |
| ENSG00000133216 | 0.7812700 | TRUE | 4 | 0.4840783 | #00B6EB |
| ENSG00000078018 | 0.7795482 | TRUE | 5 | -0.5986466 | #4B9FFF |
| ENSG00000074054 | 0.7725906 | TRUE | 3 | -0.4966667 | #00C1A8 |
| ENSG00000197892 | 0.7718133 | TRUE | 4 | 0.2020738 | #00B70C |
| ENSG00000128815 | 0.7687111 | TRUE | 3 | 0.6236224 | #00BF7D |
| ENSG00000172292 | 0.7646495 | FALSE | None | -0.9216750 | #C17FFF |
| ENSG00000136531 | 0.7632218 | TRUE | 1 | -0.9216750 | #C17FFF |
| ENSG00000071242 | 0.7628319 | TRUE | 4 | -0.5362834 | #00C0BB |
| ENSG00000126705 | 0.7614435 | TRUE | 3 | -0.5362834 | #00C0BB |
| ENSG00000163635 | 0.7603906 | TRUE | 5 | 0.4727632 | #F365E6 |
| ENSG00000198752 | 0.7545511 | TRUE | 3 | -0.5362834 | #00C0BB |
| ENSG00000185345 | 0.7532615 | TRUE | 3 | -0.9216750 | #C17FFF |
| ENSG00000148737 | 0.7520793 | TRUE | 3 | -0.5362834 | #00C0BB |
| ENSG00000164061 | 0.7519603 | FALSE | None | -0.5362834 | #00C0BB |
| ENSG00000121297 | 0.7482135 | TRUE | 4 | 0.2020738 | #00B70C |
| ENSG00000176697 | 0.7390388 | TRUE | 5 | 0.4840783 | #00B6EB |
| ENSG00000100354 | 0.7378861 | TRUE | 2 | 0.4843988 | #8EAB00 |
| ENSG00000112640 | 0.7351863 | TRUE | 4 | -0.6557529 | #53B400 |
| ENSG00000139910 | 0.7307468 | FALSE | None | -0.9216750 | #C17FFF |
| ENSG00000107105 | 0.7228099 | TRUE | 4 | -0.9216750 | #C17FFF |
| ENSG00000157014 | 0.7213163 | FALSE | None | -0.5362834 | #00C0BB |
| ENSG00000178568 | 0.7208969 | TRUE | 5 | -0.5986466 | #4B9FFF |
| ENSG00000170043 | 0.7178050 | FALSE | None | -0.5362834 | #00C0BB |
| ENSG00000132821 | 0.7165198 | FALSE | None | -0.6714074 | #8195FF |
| ENSG00000187555 | 0.7159583 | TRUE | 2 | -0.6557529 | #53B400 |
| ENSG00000170485 | 0.7103425 | TRUE | 4 | -0.5362834 | #00C0BB |
| ENSG00000116254 | 0.7067786 | TRUE | 5 | -0.5986466 | #4B9FFF |
| ENSG00000137075 | 0.7033723 | TRUE | 4 | -0.5362834 | #00C0BB |
| ENSG00000132535 | 0.7028420 | TRUE | 5 | -0.6714074 | #8195FF |
| ENSG00000135365 | 0.7020656 | TRUE | 4 | 0.2020738 | #00B70C |
| ENSG00000166206 | 0.7005120 | TRUE | 2 | -0.1535426 | #FF699D |
| ENSG00000095787 | 0.7004685 | TRUE | 2 | -0.1165650 | #FD6F86 |
Running the model on all non-SFARI genes
# Score every non-SFARI gene with the fitted model (note this includes genes
# the model saw during training, not just test genes)
negative_set = dataset %>% filter(!SFARI) %>% dplyr::select(-SFARI)
# filter() drops rownames; rebuild them from the full dataset
rownames(negative_set) = rownames(dataset)[!dataset$SFARI]
negative_set$prob = predict(fit, newdata=negative_set, type='response')
negative_set$pred = negative_set$prob>0.5
negative_set = negative_set %>% apply_labels(prob = 'Assigned Probability',
pred = 'Label Prediction')
cro(negative_set$pred)
| #Total | |
|---|---|
| Label Prediction | |
| FALSE | 9064 |
| TRUE | 4975 |
| #Total cases | 14039 |
# Count of negative-labelled genes the model flags as ASD-related
cat(paste0('\n', sum(negative_set$pred), ' genes are predicted as ASD-related'))
##
## 4975 genes are predicted as ASD-related
# Distribution of model probabilities across all negative-labelled genes,
# with the 0.5 decision threshold marked
negative_set %>% ggplot(aes(prob)) + geom_density(color='#F8766D', fill='#F8766D', alpha=0.5) +
geom_vline(xintercept=0.5, color='#666666', linetype='dashed') +
ggtitle('Probability distribution of all the Negative samples in the dataset') +
theme_minimal()
There’s a lot of noise, but the genes with the highest probabilities have slightly higher (absolute) Gene Significance
# Probability vs signed and absolute Gene Significance, coloured by MTcor
negative_set %>% ggplot(aes(prob, GS, color=MTcor)) + geom_point() + geom_smooth(method='loess', color='#666666') +
geom_hline(yintercept=0, color='gray', linetype='dashed') +
scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) +
ggtitle('Relation between Probability and Gene Significance') + theme_minimal()
negative_set %>% ggplot(aes(prob, abs(GS), color=MTcor)) + geom_point() + geom_smooth(method='loess', color='#666666') +
scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) +
ggtitle('Relation between Model probability and Gene Significance') + theme_minimal()
On average, the genes belonging to the modules with the highest correlation to ASD are assigned a lower probability by the model (???)
# Probability as a function of each gene's Module-Trait correlation
negative_set %>% ggplot(aes(MTcor, prob, color=GS)) + geom_point() + geom_smooth(method='loess', color='#666666') +
scale_color_gradientn(colours=c('#F8766D','#F8766D','white','#00BFC4','#00BFC4')) +
xlab('Modules ordered by their correlation to ASD') + ylab('Model probability') +
theme_minimal()
Summarised version, plotting by module instead of by gene
The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration while this considers each module as an observation by itself, so the top one is strongly affected by big modules and the bottom one treats all modules the same
# Mean/SD of probability per module (MTcor is unique per module, so grouping
# by it is grouping by module).
# NOTE(review): joining on the floating-point MTcor column only works because
# both sides come from the same source values; a module-ID join would be safer
plot_data = negative_set %>% group_by(MTcor) %>% summarise(mean = mean(prob), sd = sd(prob), n = n()) %>%
mutate(MTcor_sign = ifelse(MTcor>0, 'Positive', 'Negative')) %>% left_join(original_dataset, by='MTcor') %>%
dplyr::select(matches(clustering_selected), MTcor, MTcor_sign, mean, sd, n) %>% distinct()
colnames(plot_data)[1] = 'ID'
ggplotly(plot_data %>% ggplot(aes(MTcor, mean, size=n, color=MTcor_sign)) + geom_point(aes(id=ID)) +
geom_smooth(method='loess', color='gray', se=FALSE) + geom_smooth(method='lm', se=FALSE) +
xlab('Module-Diagnosis correlation') + ylab('Mean Probability by Model') + theme_minimal())
There is a positive relation between level of expression and probability, the model seems to be capturing indirectly the level of expression of the genes to make the prediction, so it’s introducing the same bias
# Gandal dataset
# Non-standardised expression data, used to relate model probability to each
# gene's raw mean expression and standard deviation
load('./../Data/Gandal/preprocessed_data_non_standardised_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
mean_and_sd = data.frame(ID=rownames(datExpr), meanExpr=rowMeans(datExpr), sdExpr=apply(datExpr,1,sd))
plot_data = negative_set %>% mutate(ID=rownames(negative_set)) %>% left_join(mean_and_sd, by='ID') %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)) %>%
dplyr::select(ID, matches(clustering_selected)), by='ID')
# The last joined column is the module assignment; give it a friendlier name
colnames(plot_data)[ncol(plot_data)] = 'Module'
plot_data %>% ggplot(aes(meanExpr, prob)) + geom_point(alpha=0.1, color='#0099cc') +
geom_smooth(method='loess', color='gray', alpha=0.3) +
geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) +
theme_minimal() + ggtitle('Mean expression vs model probability by gene')
rm(mean_and_sd)
# Same relation aggregated per module; each point is coloured by its module's
# hex colour (Module values ARE colour codes)
plot_data2 = plot_data %>% group_by(Module) %>% summarise(meanExpr = mean(meanExpr), meanProb = mean(prob), n=n())
ggplotly(plot_data2 %>% ggplot(aes(meanExpr, meanProb, size=n)) + geom_point(color=plot_data2$Module) +
geom_smooth(method='loess', se=TRUE, color='gray', alpha=0.1, size=0.7) + theme_minimal() + theme(legend.position='none') +
ggtitle('Mean expression vs model probability by Module'))
rm(plot_data2)
There is also a positive relation between the standard deviation of a gene and its regression score, the model could be capturing this characteristic of the genes to make the prediction, and could be introducing bias
# SD vs probability, restricted to sdExpr<0.5 to cut extreme outliers
plot_data %>% filter(sdExpr<0.5) %>% ggplot(aes(sdExpr, prob)) + geom_point(alpha=0.1, color='#0099cc') +
geom_smooth(method='loess', color='gray', alpha=0.2) +
geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) +
theme_minimal() + ggtitle('SD vs model probability by gene')
This approximation curve looks like the opposite of the trend found between mean/sd and model scores
# Mean-SD relation on log-log axes, for context on the two previous plots
plot_data %>% ggplot(aes(meanExpr, sdExpr)) + geom_point(alpha=0.1, color='#0099cc') +
geom_smooth(method='loess', color='gray', alpha=0.3) +
geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) +
scale_x_log10() + scale_y_log10() +
theme_minimal() + ggtitle('Mean expression vs SD by gene')
There is a relation between probability and lfc, so it IS capturing a bit of true information (because lfc and mean expression were negatively correlated and it still has a positive relation in the model)
# Probability vs differential-expression log fold change from DE_info;
# |lfc|<10 drops a handful of extreme outliers from the plot
plot_data = negative_set %>% mutate(ID=rownames(negative_set)) %>%
left_join(DE_info %>% mutate(ID=rownames(DE_info)), by='ID')
plot_data %>% filter(abs(log2FoldChange)<10) %>%
ggplot(aes(log2FoldChange, prob)) + geom_point(alpha=0.1, color='#0099cc') +
geom_smooth(method='loess', color='gray', alpha=0.3) +
theme_minimal() + ggtitle('lfc vs model probability by gene')
Original conclusion: The model is capturing the mean level of expression of the genes (indirectly through module membership), which is a strong bias found in the SFARI scores, but it seems to be capturing a bit of true biological signal as well (based on the GS and the log fold change plots) …
New conclusion: The model seems to be capturing some sort of confounding variable to make the predictions, it would seem that it’s related to the mean expression or the standard deviation of the genes, but in 10_10_20_data_preprocessing_standardising_expr.RData we standardised the dataset and it made no difference in the resulting clusterings, which means that there is some other behaviour related to the level of expression of a gene or its standard deviation that is biasing the classifier, but I don’t know what it could be or how to fix it …
# Record package versions for reproducibility
sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] expss_0.9.1 ROCR_1.0-7 gplots_3.0.1
## [4] Rtsne_0.15 RColorBrewer_1.1-2 gridExtra_2.3
## [7] viridis_0.5.1 viridisLite_0.3.0 plotly_4.8.0
## [10] knitr_1.22 forcats_0.3.0 stringr_1.4.0
## [13] dplyr_0.8.0.1 purrr_0.3.1 readr_1.3.1
## [16] tidyr_0.8.3 tibble_2.1.1 ggplot2_3.1.0
## [19] tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 htmlTable_1.11.2
## [3] XVector_0.22.0 GenomicRanges_1.34.0
## [5] base64enc_0.1-3 rstudioapi_0.7
## [7] bit64_0.9-7 AnnotationDbi_1.44.0
## [9] lubridate_1.7.4 xml2_1.2.0
## [11] splines_3.5.2 geneplotter_1.60.0
## [13] Formula_1.2-3 jsonlite_1.6
## [15] Cairo_1.5-9 annotate_1.60.1
## [17] broom_0.5.1 cluster_2.0.7-1
## [19] shiny_1.2.0 compiler_3.5.2
## [21] httr_1.4.0 backports_1.1.2
## [23] assertthat_0.2.1 Matrix_1.2-15
## [25] lazyeval_0.2.2 cli_1.1.0
## [27] later_0.8.0 acepack_1.4.1
## [29] htmltools_0.3.6 tools_3.5.2
## [31] gtable_0.2.0 glue_1.3.1
## [33] GenomeInfoDbData_1.2.0 Rcpp_1.0.1
## [35] Biobase_2.42.0 cellranger_1.1.0
## [37] gdata_2.18.0 nlme_3.1-137
## [39] crosstalk_1.0.0 xfun_0.5
## [41] rvest_0.3.2 miniUI_0.1.1.1
## [43] mime_0.6 gtools_3.5.0
## [45] XML_3.98-1.11 zlibbioc_1.28.0
## [47] scales_1.0.0 hms_0.4.2
## [49] promises_1.0.1 parallel_3.5.2
## [51] SummarizedExperiment_1.12.0 yaml_2.2.0
## [53] memoise_1.1.0 rpart_4.1-13
## [55] ggExtra_0.8 RSQLite_2.1.1
## [57] latticeExtra_0.6-28 stringi_1.4.3
## [59] genefilter_1.64.0 highr_0.8
## [61] S4Vectors_0.20.1 checkmate_1.8.5
## [63] caTools_1.17.1 BiocGenerics_0.28.0
## [65] BiocParallel_1.16.6 GenomeInfoDb_1.18.2
## [67] rlang_0.3.2 pkgconfig_2.0.2
## [69] matrixStats_0.54.0 bitops_1.0-6
## [71] evaluate_0.13 lattice_0.20-38
## [73] htmlwidgets_1.3 labeling_0.3
## [75] bit_1.1-14 tidyselect_0.2.5
## [77] plyr_1.8.4 magrittr_1.5
## [79] DESeq2_1.22.2 R6_2.4.0
## [81] IRanges_2.16.0 generics_0.0.2
## [83] Hmisc_4.1-1 DBI_1.0.0
## [85] DelayedArray_0.8.0 pillar_1.3.1
## [87] haven_1.1.1 foreign_0.8-71
## [89] withr_2.1.2 survival_2.43-3
## [91] RCurl_1.95-4.10 nnet_7.3-12
## [93] modelr_0.1.4 crayon_1.3.4
## [95] KernSmooth_2.23-15 rmarkdown_1.12
## [97] locfit_1.5-9.1 grid_3.5.2
## [99] readxl_1.1.0 data.table_1.12.0
## [101] blob_1.1.1 digest_0.6.18
## [103] xtable_1.8-3 httpuv_1.5.0
## [105] stats4_3.5.2 munsell_0.5.0